1 Bibliotecas

library(rmarkdown)
library(tidyverse) 
library(tidymodels)
library(ggplot2)
library(GGally)
library(plotly)
library(ggcorrplot)
library(ggfortify)
library(skimr)
library(caret)
library(factoextra)
library(reactable)
library(glmnet)
library(kknn)
library(ranger)
library(vip)
library(ROCR)
library(pROC)

2 Base de dados

# Load the audit-risk dataset and print a full column-by-column summary.
data <- read.csv("audit_risk.csv")
skim(data)
Data summary
Name data
Number of rows 776
Number of columns 27
_______________________
Column type frequency:
character 1
numeric 26
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
LOCATION_ID 0 1 1 7 0 45 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Sector_score 0 1 20.18 24.32 1.85 2.37 3.89 55.57 59.85 ▇▁▁▁▃
PARA_A 0 1 2.45 5.68 0.00 0.21 0.88 2.48 85.00 ▇▁▁▁▁
Score_A 0 1 0.35 0.17 0.20 0.20 0.20 0.60 0.60 ▇▁▃▁▅
Risk_A 0 1 1.35 3.44 0.00 0.04 0.17 1.49 51.00 ▇▁▁▁▁
PARA_B 0 1 10.80 50.08 0.00 0.00 0.41 4.16 1264.63 ▇▁▁▁▁
Score_B 0 1 0.31 0.17 0.20 0.20 0.20 0.40 0.60 ▇▁▁▁▃
Risk_B 0 1 6.33 30.07 0.00 0.00 0.08 1.84 758.78 ▇▁▁▁▁
TOTAL 0 1 13.22 51.31 0.00 0.54 1.37 7.71 1268.91 ▇▁▁▁▁
numbers 0 1 5.07 0.26 5.00 5.00 5.00 5.00 9.00 ▇▁▁▁▁
Score_B.1 0 1 0.22 0.08 0.20 0.20 0.20 0.20 0.60 ▇▁▁▁▁
Risk_C 0 1 1.15 0.54 1.00 1.00 1.00 1.00 5.40 ▇▁▁▁▁
Money_Value 1 1 14.14 66.61 0.00 0.00 0.09 5.60 935.03 ▇▁▁▁▁
Score_MV 0 1 0.29 0.16 0.20 0.20 0.20 0.40 0.60 ▇▁▁▁▂
Risk_D 0 1 8.27 39.97 0.00 0.00 0.02 2.24 561.02 ▇▁▁▁▁
District_Loss 0 1 2.51 1.23 2.00 2.00 2.00 2.00 6.00 ▇▁▁▁▁
PROB 0 1 0.21 0.04 0.20 0.20 0.20 0.20 0.60 ▇▁▁▁▁
RiSk_E 0 1 0.52 0.29 0.40 0.40 0.40 0.40 2.40 ▇▁▁▁▁
History 0 1 0.10 0.53 0.00 0.00 0.00 0.00 9.00 ▇▁▁▁▁
Prob 0 1 0.21 0.04 0.20 0.20 0.20 0.20 0.60 ▇▁▁▁▁
Risk_F 0 1 0.05 0.31 0.00 0.00 0.00 0.00 5.40 ▇▁▁▁▁
Score 0 1 2.70 0.86 2.00 2.00 2.40 3.25 5.20 ▇▁▁▂▁
Inherent_Risk 0 1 17.68 54.74 1.40 1.58 2.21 10.66 801.26 ▇▁▁▁▁
CONTROL_RISK 0 1 0.57 0.44 0.40 0.40 0.40 0.40 5.80 ▇▁▁▁▁
Detection_Risk 0 1 0.50 0.00 0.50 0.50 0.50 0.50 0.50 ▁▁▇▁▁
Audit_Risk 0 1 7.17 38.67 0.28 0.32 0.56 3.25 961.51 ▇▁▁▁▁
Risk 0 1 0.39 0.49 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
# Clean and standardize the working copy of the data:
#  1. drop the row with a missing Money_Value;
#  2. coerce every column to numeric — LOCATION_ID contains non-numeric codes,
#     which become NA (warnings suppressed deliberately) and are dropped next;
#  3. remove zero-variance columns (scale() would divide by 0);
#  4. center and scale everything (mean 0, sd 1).
df <- na.omit(data)
df <- suppressWarnings(sapply(df, as.numeric))  # returns a numeric matrix
df <- na.omit(df)
df <- as.data.frame(scale(df[, apply(df, 2, var) != 0]))

3 Análise Descritiva

3.1 Média e desvio padrão por grupo de Risco (0 e 1)

# Per-group (Risk = 0 vs 1) mean and sd of every standardized variable.
# NOTE(review): summarise_all() is superseded by across(), but the positional
# indexing below (rows 2:26 assumed to be means, rows 27:51 sds) depends on
# summarise_all's exact column ordering — confirm before modernizing.
c <- df %>% 
  group_by(Risk) %>% 
  summarise_all(.funs = list(M=mean,ST=sd))
c <- round(c,2)
# Transpose so each row is a variable_stat and the two columns are the groups.
c <- as.data.frame(t(c))
colnames(c) <- c("Risk0","Risk1")
c <- na.omit(c)

# Reassemble into a tidy table: one row per variable, four stat columns.
names <- colnames(df[1:25])
cc<- tibble(Variavel=names[1:25],MEDIA0=c$Risk0[2:26],
            SD0=c$Risk0[27:51],
            MEDIA1=c$Risk1[2:26],SD1=c$Risk1[27:51])

# Grouped bar chart: mean and sd of each variable, split by Risk group.
fig <- plot_ly(data = cc, x = ~Variavel,
               y = ~MEDIA0, type = 'bar', name = "MEDIA0")
fig <- fig %>% add_trace(y = ~SD0, name = 'SD0')
fig <- fig %>% add_trace(y = ~MEDIA1, name = 'MEDIA1')
fig <- fig %>% add_trace(y = ~SD1, name = 'SD1')
fig <- fig %>% layout(barmode = 'group')
# FIX: title typos ("Media" -> "Média", "padraão" -> "padrão").
fig <- fig %>% layout(title = "Média e desvio padrão por grupo de Risk (0 ou 1)"
                      ,yaxis = list(title=""))
fig

3.2 PCA

Analisando o PCA (scree plot e variância acumulada por componente), é possível determinar o número de componentes principais a ser utilizado nas etapas de modelagem.

# PCA on the predictors only (Risk excluded); df is already standardized,
# so prcomp needs no extra centering/scaling here.
df_pca <- df %>% select(-Risk)
res_pca <- prcomp(df_pca)
fviz_eig(res_pca)  # scree plot of explained variance

# Importance table (sd, proportion, cumulative proportion) per component.
importancia <- summary(res_pca)$importance
s <- as.data.frame(round(importancia, 3))

reactable(s, highlight = TRUE, resizable = TRUE, compact = TRUE)

3.3 Correlação entre variáveis

# Full correlation matrix (upper triangle) as an interactive heatmap.
c <- df %>%
  cor() %>%
  ggcorrplot(lab = TRUE, lab_size = 2, tl.cex = 10, type = "upper",
             colors = c("red", "white", "blue"), show.diag = TRUE, digits = 1)
ggplotly(c)

# Correlation of every predictor with the Risk outcome.
corr <- function(x) cor(x, df$Risk)
corre <- map_dbl(df %>% select(-'Risk'), corr)
corre <- as.data.frame(corre)

fig <- plot_ly(
  x = row.names(corre),
  y = corre$corre,
  text = row.names(corre),
  name = "Correlacao com Risco",
  type = 'bar', orientation = 'v',
  color = ~corre$corre, colors = c("red", "white", "blue"))

# FIX: y-axis label typo ("Corrleção" -> "Correlação").
fig <- fig %>% layout(title = "Correlação com Risk",
                      yaxis = list(title = "Correlação"))
fig

4 Tidymodels

4.1 Processamento em paralelo

library(doParallel)
# Use all logical cores minus two, but never fewer than one:
# on a machine with <= 2 cores the original expression would yield 0 (or a
# negative number) and makeCluster() would fail.
all_cores <- max(1L, parallel::detectCores(logical = TRUE) - 2L)
cl <- makeCluster(all_cores)
registerDoParallel(cl)

4.2 Treinamento e Teste

# Rebuild df from the raw data: everything numeric except the outcome,
# which becomes a factor so tidymodels treats this as classification.
df <- as.data.frame(sapply(data, as.numeric))
df$Risk <- factor(df$Risk)

# 70/30 split stratified on the outcome, plus 10-fold CV on the training set.
split <- initial_split(df, prop = 0.7, strata = "Risk")

treinamento <- training(split)
teste <- testing(split)

cv_split <- vfold_cv(treinamento, v = 10, strata = "Risk")

4.3 Receita

# Preprocessing recipe: drop incomplete rows, normalize the predictors, then
# project them onto 15 principal components.
# FIX: center/scale now run BEFORE step_pca — PCA on un-normalized predictors
# lets high-variance columns dominate the components, and re-scaling the
# scores afterwards (as the original did) distorts them.
receita <- recipe(Risk ~ ., treinamento) %>%
    step_naomit(all_numeric()) %>%
    step_center(all_numeric(), -all_outcomes()) %>%
    step_scale(all_numeric(), -all_outcomes()) %>%
    step_pca(all_numeric(), -all_outcomes(), num_comp = 15)

receita_prep <- prep(receita)

# Processed training set (juice) and test set baked with the SAME recipe,
# so test rows are transformed with statistics learned on training data only.
treinamento_proc <- juice(receita_prep)

teste_proc <- bake(receita_prep, new_data = teste)

# (redundant re-creation of cv_split removed — it is already defined above)

4.4 Lasso

# L1-penalized logistic regression; mixture = 1 states the lasso penalty
# explicitly (glmnet's default alpha is already 1, so behavior is unchanged).
lasso <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("classification")

# Tune the penalty over 50 candidates with 10-fold CV on the recipe.
lambda_tune <- tune_grid(lasso,
                         receita,
                         resamples = cv_split,
                         metrics = metric_set(roc_auc, accuracy),
                         grid = 50)

autoplot(lambda_tune)

# FIX: current versions of tune require `metric` to be passed by name;
# the positional form select_best(x, "accuracy") is deprecated.
best <- select_best(lambda_tune, metric = "accuracy")
lasso <- finalize_model(lasso, parameters = best)
fit_lasso <- fit(lasso, Risk ~ ., data = treinamento_proc)

pred_lasso <- predict(fit_lasso, teste_proc)

4.5 Random Forest

# Random forest with tuned mtry/trees; permutation importance so vip() works.
# FIX: translate() removed — it is for inspecting the engine call, not part
# of building a tunable specification.
rf <- rand_forest(mtry = tune(), trees = tune()) %>%
  set_engine("ranger", importance = "permutation") %>%
  set_mode("classification")

rf_tune <- tune_grid(rf,
                     receita,
                     resamples = cv_split,
                     metrics = metric_set(roc_auc, accuracy),
                     grid = 10)

autoplot(rf_tune)

# FIX: pass `metric` by name (positional form is deprecated in tune).
best <- select_best(rf_tune, metric = "roc_auc")
rf <- finalize_model(rf, parameters = best)
fit_rf <- fit(rf, Risk ~ ., data = treinamento_proc)

pred_rf <- predict(fit_rf, teste_proc)

4.6 KNN

# K-nearest neighbors with the number of neighbors tuned over 50 candidates.
knn <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")

knn_tune <- tune_grid(knn,
                      receita,
                      resamples = cv_split,
                      metrics = metric_set(roc_auc, accuracy),
                      grid = 50)

autoplot(knn_tune)

# FIX: pass `metric` by name (positional form is deprecated in tune).
best <- select_best(knn_tune, metric = "roc_auc")
knn <- finalize_model(knn, parameters = best)
fit_knn <- fit(knn, Risk ~ ., data = treinamento_proc)

pred_knn <- predict(fit_knn, teste_proc)

5 Comparação entre modelos

# Parallel workers are no longer needed past this point.
stopCluster(cl)

# Collect observed and predicted classes as numeric codes
# (factor levels map to 1/2) for ROC and MSE comparison below.
test_train <- data.frame(
  Teste = as.numeric(teste_proc$Risk),
  LASSO = as.numeric(pred_lasso$.pred_class),
  RF    = as.numeric(pred_rf$.pred_class),
  KNN   = as.numeric(pred_knn$.pred_class)
)

# Model names = every column except the observed one.
modelos <- colnames(test_train)[-1]

5.1 ROC

# pROC's formula interface builds one ROC curve per right-hand-side column
# (LASSO, RF, KNN) against the observed Teste labels; ggroc overlays them.
rocs <- roc(Teste ~ ., data = test_train)
ggplotly(ggroc(rocs))

5.2 EQM

# Mean squared error between two equal-length numeric vectors.
EQMF <- function(x, y) {
  mean((x - y)^2)
}
# MSE of each model's predicted class codes against the observed codes.
# FIX: vapply replaces sapply — type-stable, guarantees a numeric vector.
i <- seq(2, ncol(test_train))
EQM <- vapply(i,
              function(k) EQMF(test_train[[k]], test_train$Teste),
              numeric(1))

# Bar chart of models ordered by increasing error.
fig <- plot_ly(x = ~reorder(modelos, EQM), y = EQM, type = "bar", name = "EQM")
fig <- fig %>% layout(title = "EQM x Modelo", xaxis = list(title = "Modelo"),
                      yaxis = list(title = "EQM"))
fig